#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import time
from sklearn import preprocessing
from sklearn.datasets.california_housing import fetch_california_housing   #导入房价数据
from sklearn import tree   # 导入树模块，可以使用树模块建立决策树模型进行分类或者回归
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV     # 对估计量的指定参数值进行详尽搜索，对多有可能的参数遍历，寻找最好的参数
from sklearn.ensemble import RandomForestRegressor   # 随机森林
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz  # 决策树分类器
from sklearn.ensemble import RandomForestClassifier   # 随机森林模块
import pydotplus
import os
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_curve
from sklearn import preprocessing
from sklearn.datasets.california_housing import fetch_california_housing   #导入房价数据
from sklearn import tree   # 导入树模块，可以使用树模块建立决策树模型进行分类或者回归
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV     # 对估计量的指定参数值进行详尽搜索，对多有可能的参数遍历，寻找最好的参数
from sklearn.ensemble import RandomForestRegressor   # 随机森林

def run_sign_model():
    """Compare single decision trees against ensembles on the donations data.

    Trains a depth-1 stump, a depth-3 tree, a depth-3 tree without the
    dominant feature, the average of the last two, and a random forest,
    printing the ROC-AUC of each and writing Graphviz PNGs of the trees.
    """
    SEED = 222
    np.random.seed(SEED)
    # Allow pandas to display up to 50 columns when printing frames.
    pd.options.display.max_columns = 50
    # Graphviz binaries must be on PATH so tree visualisation works (Windows install path).
    os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
    df = pd.read_csv('E:\培训教程\python\唐宇迪-机器学习课程\机器学习算法配套案例实战\集成算法\input.csv')
    # Columns:
    #   cand_pty_affiliation: the label to predict - Republican or Democrat
    #   entity_tp: individual or organisation
    #   classification: sector / field
    #   rpt_tp: size of the contribution
    #   cycle: donation year
    #   transaction_amt: donation amount
    # Split the data; ytrain / ytest hold the labels.
    xtrain, xtest, ytrain, ytest = get_train_test(df)
    # Depth-1 decision tree ("stump") with a fixed random seed.
    t1 = DecisionTreeClassifier(max_depth=1, random_state=SEED)
    # Fit on the training set.
    t1.fit(xtrain, ytrain)
    # Take column 1 of predict_proba: probability of the positive class.
    p = t1.predict_proba(xtest)[:, 1]
    print("Decision_signle tree ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
    # Visualise the stump (it ends up predicting the same class everywhere).
    print_graph(t1, xtrain.columns, 'Decision_signle')
    # Retune: a deeper tree with max_depth=3.
    t2 = DecisionTreeClassifier(max_depth=3, random_state=SEED)
    t2.fit(xtrain, ytrain)
    p2 = t2.predict_proba(xtest)[:, 1]
    # Most samples fall into two leaves - the tree looks close to overfitting.
    print("Decision tree_max_depth_3 ROC-AUC score: %.3f" % roc_auc_score(ytest, p2))
    print_graph(t2, xtrain.columns, 'Decision tree_max_depth_3')

    # Drop the most influential feature and retrain, since the result depends
    # on this column too heavily.
    drop = ["transaction_amt"]
    # FIX: the positional `axis` argument to DataFrame.drop was deprecated and
    # removed in pandas 2.0 -- use the keyword form.
    xtrain_slim = xtrain.drop(columns=drop)
    xtest_slim = xtest.drop(columns=drop)
    t3 = DecisionTreeClassifier(max_depth=3, random_state=SEED)
    t3.fit(xtrain_slim, ytrain)
    p3 = t3.predict_proba(xtest_slim)[:, 1]
    print("Decision tree_drop ROC-AUC score: %.3f" % roc_auc_score(ytest, p3))
    print_graph(t3, xtrain_slim.columns, 'Decision tree_drop')
    # The two trees score similarly yet look very different - each makes its
    # own mistakes, so combine them.
    p = np.mean([p2, p3], axis=0)
    # Averaging the two trees' probabilities raises the AUC - this is the core
    # idea behind random forests. roc_auc_score takes (true labels, scores).
    print("Average of decision tree ROC-AUC score: %.3f" % roc_auc_score(ytest, p))

    # sklearn random forest; n_estimators is the number of trees, and more
    # trees generally improves the AUC.
    rf = RandomForestClassifier(
        n_estimators=50,
        max_features=3,
        random_state=SEED)
    rf.fit(xtrain, ytrain)
    p = rf.predict_proba(xtest)[:, 1]
    print("Average of decision_add_tree tree ROC-AUC score: %.3f" % roc_auc_score(ytest, p))


def get_train_test(df, test_size=0.9):
    """Split data into train and test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; must contain the label column ``cand_pty_affiliation``.
    test_size : float, default 0.9
        Fraction of rows assigned to the TEST set. NOTE: the default keeps
        only 10% of the data for training (the original comment claiming a
        10% test share was wrong).

    Returns
    -------
    tuple
        ``(xtrain, xtest, ytrain, ytest)`` from ``train_test_split``.
    """
    # Encode the label as 0/1 (1 = "REP") instead of a string, for modelling.
    y = 1 * (df.cand_pty_affiliation == "REP")
    # Remove the label column from the feature matrix.
    X = df.drop(["cand_pty_affiliation"], axis=1)
    # One-hot encode categorical columns (sparse to save memory); see
    # http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html
    X = pd.get_dummies(X, sparse=True)
    # Drop constant columns: a single unique value carries no information.
    # FIX: nunique() > 1 is equivalent to the previous std() == 0 test for
    # these 0/1 dummy columns and also works reliably on sparse dtypes.
    X = X.loc[:, X.nunique() > 1]
    # Fixed random_state makes the split identical on every run.
    return train_test_split(X, y, test_size=test_size, random_state=222)

def print_graph(clf, feature_names, picture_name):
    """Render a fitted decision tree to ``<picture_name>.png`` via Graphviz."""
    # Export the tree to DOT source in-memory (out_file=None returns a string).
    dot_source = export_graphviz(
        clf,
        label="root",
        proportion=True,
        impurity=False,
        out_file=None,
        feature_names=feature_names,
        class_names={0: "D", 1: "R"},
        filled=True,
        rounded=True
    )
    # Parse the DOT text and write the PNG next to the script.
    return pydotplus.graph_from_dot_data(dot_source).write_png(picture_name + '.png')

def run_multi_models():
    """Train a library of base learners on the donations data and print the
    ROC-AUC each one achieves on the held-out test set."""
    SEED = 222
    np.random.seed(SEED)
    pd.options.display.max_columns = 50
    os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
    # Columns: cand_pty_affiliation (target: REP/DEM), entity_tp (person or
    # organisation), classification (sector), rpt_tp (contribution size),
    # cycle (donation year), transaction_amt (donation amount).
    df = pd.read_csv('E:\培训教程\python\唐宇迪-机器学习课程\机器学习算法配套案例实战\集成算法\input.csv')
    # Work on the first 10k rows only to keep training fast.
    df = df.head(10000)
    xtrain, xtest, ytrain, ytest = get_train_test(df)
    predictions = train_predict(get_models(), xtrain, ytrain, xtest, ytest)
    score_models(predictions, ytest)
'''Ensemble using multiple kinds of models.'''
def get_models():
    """Generate a library of base learners, keyed by a short display name."""
    SEED = 222
    np.random.seed(SEED)
    # Key order is preserved by dict insertion order, so downstream frames
    # keep their columns in this order.
    return {
        'svm': SVC(C=100, probability=True),
        'knn': KNeighborsClassifier(n_neighbors=3),
        'naive bayes': GaussianNB(),
        'mlp-nn': MLPClassifier((80, 10), early_stopping=False, random_state=SEED),
        'random forest': RandomForestClassifier(n_estimators=10, max_features=3, random_state=SEED),
        'gbm': GradientBoostingClassifier(n_estimators=100, random_state=SEED),
        'logistic': LogisticRegression(C=100, random_state=SEED),
    }

def train_predict(model_list, xtrain, ytrain, xtest, ytest):
    """Fit every model in ``model_list`` on the training set and return a
    DataFrame of positive-class test probabilities, one column per model,
    in the dict's insertion order."""
    # Pre-size the result: one row per test sample, one column per model.
    predictions = pd.DataFrame(np.zeros((ytest.shape[0], len(model_list))))
    print("Fitting models.")
    column_names = []
    for idx, (name, model) in enumerate(model_list.items()):
        print("%s..." % name, end=" ", flush=False)
        model.fit(xtrain, ytrain)
        # Column 1 of predict_proba is the probability of the positive class.
        predictions.iloc[:, idx] = model.predict_proba(xtest)[:, 1]
        column_names.append(name)
        print("done")

    predictions.columns = column_names
    print(predictions)
    print("Done.\n")
    return predictions


def score_models(P, y):
    """Print the ROC-AUC of every column in prediction frame ``P`` against
    the true labels ``y``."""
    print("Scoring models.")
    for name in P.columns:
        # One model at a time: true labels vs. that model's probabilities.
        print("%-26s: %.3f" % (name, roc_auc_score(y, P.loc[:, name])))
    print("Done.\n")

def run_simple_dec_tree():
    """Fit a decision tree and random forests on the California housing data,
    then grid-search random-forest hyper-parameters with cross-validation
    and print the best score/parameters.
    """
    # Graphviz binaries must be on PATH for tree visualisation (Windows install path).
    os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
    housing = fetch_california_housing()

    # test_size=0.1: hold out 10% of the rows as the test set.
    # random_state=42: fixed seed so the split is reproducible.
    # Returns (train X, test X, train y, test y). The training part is later
    # re-split internally by cross-validation, so the scarce real test set is
    # only touched at the very end.
    data_train, data_test, target_train, target_test = train_test_split(housing.data, housing.target, test_size=0.1, random_state=42)
    dtr = tree.DecisionTreeRegressor(random_state=42)
    dtr.fit(data_train, target_train)

    print(dtr.predict(data_test).size)

    rfr = RandomForestRegressor(random_state=42)
    rfr.fit(data_train, target_train)
    # FIX: the score was previously computed and silently discarded; report it.
    print('random forest test score:', rfr.score(data_test, target_test))

    # Candidate values: min_samples_split in (3, 6, 9) and n_estimators in
    # (10, 50, 100) - 9 parameter combinations to search.
    tree_param_grid = {'min_samples_split': list((3, 6, 9)), 'n_estimators': list((10, 50, 100))}
    # cv=3: split the training data into 3 folds, each serving once as the
    # validation fold (the old comment saying cv=5 did not match the code).
    # GridSearchCV exhaustively tries every combination and keeps the best.
    grid = GridSearchCV(RandomForestRegressor(), param_grid=tree_param_grid, cv=3)
    # Fit on the training portion only; CV handles train/validation splits.
    grid.fit(data_train, target_train)

    # Best cross-validated score over all parameter combinations.
    print(grid.best_score_)
    # The parameter combination that achieved the best score above.
    print(grid.best_params_)
    print('scorer : ', grid.scorer_)
    print(grid.predict(data_test))

# Script entry point: only the decision-tree / grid-search demo runs by
# default; run_sign_model() and run_multi_models() must be invoked manually.
if __name__ == '__main__':
    run_simple_dec_tree()